Differential expression using the bulk RNAseq data from the May release.

R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

#path = "~/research/data/MorPhiC/May-2024/JAX_RNAseq2_ExtraEmbryonic/"
path = "../../MorPhiC/data/May-2024/JAX_RNAseq2_ExtraEmbryonic/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")

Read in meta data

meta = fread("data/JAX_RNAseq2_ExtraEmbryonic_meta_data.tsv", 
             data.table = FALSE)
dim(meta)
## [1] 82 32
meta[1:2,]
##   biomaterial_id lib_biomaterial_id differentiated_biomaterial_id
## 1   MOK20011-WT1         GT23-13719              PrS-MOK20011-WT1
## 2   MOK20011-WT2         GT23-13720              PrS-MOK20011-WT2
##   differentiated_biomaterial_description differentiation_protocol
## 1    KOLF2.2 derived primitive syncytium                 JAXPD001
## 2    KOLF2.2 derived primitive syncytium                 JAXPD001
##   differentiated_timepoint_value differentiated_timepoint_unit
## 1                              6                          days
## 2                              6                          days
##   differentiated_terminally_differentiated
## 1                                      Yes
## 2                                      Yes
##                                 model_organ ko_strategy ko_gene
## 1 extra-embryonic primitive syncytial cells          WT      WT
## 2 extra-embryonic primitive syncytial cells          WT      WT
##   library_preparation_protocol dissociation_protocol fragment_size input_amount
## 1                     JAXPL001                   N.A           417          300
## 2                     JAXPL001                   N.A           402          300
##   input_unit final_yield final_yield_unit lib_concentration
## 1        ngs       190.0              ngs          34.51784
## 2        ngs        91.2              ngs          17.18679
##   lib_concentration_unit PCR_cycle PCR_cycle_sample_index
## 1                     nM        10                     NA
## 2                     nM        10                     NA
##                                            file_id sequencing_protocol
## 1 RUNX1_WT1_GT23-13719_CCAAGTCT-TCATCCTT_S170_L003            JAXPS001
## 2 RUNX1_WT2_GT23-13720_TTGGACTC-CTGCTTCC_S157_L003            JAXPS001
##                   run_id biomaterial_description derived_cell_line_accession
## 1 20231124_23-robson-013                 KOLF2.2                    KOLF2.2J
## 2 20231124_23-robson-013                 KOLF2.2                    KOLF2.2J
##   clone_id protocol_id zygosity type model_system
## 1      WT1         N.A      N.A iPSC          PrS
## 2      WT2         N.A      N.A iPSC          PrS
names(meta)
##  [1] "biomaterial_id"                          
##  [2] "lib_biomaterial_id"                      
##  [3] "differentiated_biomaterial_id"           
##  [4] "differentiated_biomaterial_description"  
##  [5] "differentiation_protocol"                
##  [6] "differentiated_timepoint_value"          
##  [7] "differentiated_timepoint_unit"           
##  [8] "differentiated_terminally_differentiated"
##  [9] "model_organ"                             
## [10] "ko_strategy"                             
## [11] "ko_gene"                                 
## [12] "library_preparation_protocol"            
## [13] "dissociation_protocol"                   
## [14] "fragment_size"                           
## [15] "input_amount"                            
## [16] "input_unit"                              
## [17] "final_yield"                             
## [18] "final_yield_unit"                        
## [19] "lib_concentration"                       
## [20] "lib_concentration_unit"                  
## [21] "PCR_cycle"                               
## [22] "PCR_cycle_sample_index"                  
## [23] "file_id"                                 
## [24] "sequencing_protocol"                     
## [25] "run_id"                                  
## [26] "biomaterial_description"                 
## [27] "derived_cell_line_accession"             
## [28] "clone_id"                                
## [29] "protocol_id"                             
## [30] "zygosity"                                
## [31] "type"                                    
## [32] "model_system"
table(table(meta$biomaterial_id))
## 
##  1 
## 82
table(table(meta$lib_biomaterial_id))
## 
##  1 
## 82
table(table(meta$differentiated_biomaterial_id))
## 
##  1 
## 82

Read in gene count data and filter genes

cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    83
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name MEIS2_CE_A06_GT23-12174_TGCGAGAC-CAACAATG_S1_L001
## 1 ENSG00000268674                                                 0
## 2 ENSG00000271254                                               793
##   MEF2C_PTC_B03_GT23-13717_TCTCTACT-GAACCGCG_S177_L003
## 1                                                    0
## 2                                                  937
##   RUNX1_KO_G07_GT23-13724_CGGCGTGA-ACAGGCGC_S175_L003
## 1                                                   0
## 2                                                1079
stopifnot(setequal(names(cts)[-1], meta$file_id))

meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##   82
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

Read in gene annoation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)
## character(0)

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0          0        0.0       0
## ENSG00000271254 ENSG00000271254     594     512        797      958.5     858
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674    0.00       0       1
## ENSG00000271254 1051.75    1079    1695
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.5  
##  Mean   :   323.5   Mean   :   335.2   Mean   :   593.2   Mean   :   647.1  
##  3rd Qu.:   100.0   3rd Qu.:    71.0   3rd Qu.:   217.0   3rd Qu.:   177.5  
##  Max.   :240570.0   Max.   :209625.0   Max.   :883830.0   Max.   :382389.5  
##     ExM_q75             PrS_q75            ExM_max             PrS_max      
##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0   Min.   :     0  
##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      1.0   1st Qu.:     1  
##  Median :      3.8   Median :     3.0   Median :      8.0   Median :     8  
##  Mean   :    731.5   Mean   :   772.1   Mean   :   1063.6   Mean   :  1188  
##  3rd Qu.:    279.0   3rd Qu.:   217.6   3rd Qu.:    400.2   3rd Qu.:   330  
##  Max.   :1028917.5   Max.   :454140.8   Max.   :1675470.0   Max.   :773629
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12951  1098
##   TRUE   1885 22658
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25641    82
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

Check the effect of run id among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   428.0   510.0   554.0   574.9   649.0   727.0
dim(cts_dat_m)
## [1] 25641    21
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    21 24263
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.47564883 0.12518353 0.07086713 0.05336636 0.02717944
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 47.6 12.5  7.1  5.3  2.7
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=model_system, 
                        color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="KO gene", shape ="KO strategy") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

DE analysis with respect to model system for WT samples

## Generate sample information matrix

cols2kp = c("model_system", "q75")
sample_info = meta_m[,cols2kp]

dim(sample_info)
## [1] 21  2
sample_info[1:2,]
##    model_system q75
## 59          PrS 554
## 25          PrS 428
sample_info$log_q75 = log(sample_info$q75)

dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                  ~ model_system + log_q75)
dds = DESeq(dds)

## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
## [1] 25641     6
res_rd[1:2,]
##                    baseMean log2FoldChange     lfcSE       stat    pvalue
## ENSG00000271254 944.5427577      0.1298778 0.3327253  0.3903454 0.6962812
## ENSG00000276345   0.3686712     -5.3837662 9.7887890 -0.5499931 0.5823241
##                      padj
## ENSG00000271254 0.8840768
## ENSG00000276345        NA
table(is.na(res_rd$padj))
## 
## FALSE  TRUE 
## 14702 10939
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Read depth")
print(g0)

## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ + log_q75)
res_lrt = results(dds_lrt)

res_model_system = as.data.frame(res_lrt)
dim(res_model_system)
## [1] 25641     6
res_model_system[1:2,]
##                    baseMean log2FoldChange     lfcSE        stat       pvalue
## ENSG00000271254 944.5427577      0.1298778 0.3327253 31.92267820 1.604331e-08
## ENSG00000276345   0.3686712     -5.3837662 9.7887890 -0.01029894 1.000000e+00
##                       padj
## ENSG00000271254 4.7729e-08
## ENSG00000276345 1.0000e+00
table(is.na(res_model_system$padj))
## 
## FALSE  TRUE 
## 24642   999
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Model system")
print(g0)

Analyze samples of different model system

Explore the PC plots first, to decide running the models on what level of DE group.

Then run the DE analysis.

table(meta$run_id, meta$model_system)
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0  12
##   20231004_23-robson-011       22   0
##   20231124_23-robson-013       12  12
##   20240307_24-robson-002        0  12
##   20240307_24-robson-005        0  12
table(meta$run_id, meta$ko_gene)
##                              
##                               BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
##   20230809_23-robson-007-run2       0     0     0     0    9     0     0  3
##   20231004_23-robson-011            0     0     9     7    0     0     0  6
##   20231124_23-robson-013            0     9     0     0    0     0     9  6
##   20240307_24-robson-002            9     0     0     0    0     0     0  3
##   20240307_24-robson-005            0     0     0     0    0     9     0  3
table(meta$run_id, meta$model_system, meta$ko_gene)
## , ,  = BHLHE40
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        0   0
##   20231124_23-robson-013        0   0
##   20240307_24-robson-002        0   9
##   20240307_24-robson-005        0   0
## 
## , ,  = MEF2C
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        0   0
##   20231124_23-robson-013        9   0
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   0
## 
## , ,  = MEIS1
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        9   0
##   20231124_23-robson-013        0   0
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   0
## 
## , ,  = MEIS2
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        7   0
##   20231124_23-robson-013        0   0
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   0
## 
## , ,  = MXD1
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   9
##   20231004_23-robson-011        0   0
##   20231124_23-robson-013        0   0
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   0
## 
## , ,  = NCOA3
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        0   0
##   20231124_23-robson-013        0   0
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   9
## 
## , ,  = RUNX1
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   0
##   20231004_23-robson-011        0   0
##   20231124_23-robson-013        0   9
##   20240307_24-robson-002        0   0
##   20240307_24-robson-005        0   0
## 
## , ,  = WT
## 
##                              
##                               ExM PrS
##   20230809_23-robson-007-run2   0   3
##   20231004_23-robson-011        6   0
##   20231124_23-robson-013        3   3
##   20240307_24-robson-002        0   3
##   20240307_24-robson-005        0   3
table(meta$ko_strategy, meta$ko_gene)
##      
##       BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
##   CE        3     3     3     3    3     3     3  0
##   KO        3     3     3     1    3     3     3  0
##   PTC       3     3     3     3    3     3     3  0
##   WT        0     0     0     0    0     0     0 21
unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)

for(model1 in unique_model_ss){
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+"))
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(model1) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(model1) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene, 
                          alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
    geom_point(size=2.5) + 
    scale_color_brewer(palette = "Dark2") + 
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(model1) + guides(alpha = "none") + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  table(meta_m$run_id, meta_m$ko_gene)

}
## [1] "ExM"

## [1] "PrS"

Under model system “Exm”, the batch effect caused by run id seems to be big. Run DE analysis for different run ids separately.

Under model system “PrS”, still run analysis together.

For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.

In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.).

The first version of the output files contains gene id, and for each knockout strategy:

baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

The second version of the output files contains gene id,

chr, position, gene name

and for each knockout strategy:

log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4), 

In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.

df_size = NULL

output_dir = "results/2024_05_JAX_RNAseq2_ExtraEmbryonic"
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")

if (!file.exists(raw_output_dir)){
  dir.create(raw_output_dir, recursive = TRUE)
}

if (!file.exists(processed_output_dir)){
  dir.create(processed_output_dir, recursive = TRUE)
}


for(model1 in unique_model_ss){
  
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))
  
  ## Generate sample information matrix
  cols2kp = c("model_system", "ko_strategy", "ko_gene", "run_id", "q75")
  sample_info = meta_m[,cols2kp]
  
  dim(sample_info)
  sample_info[1:2,]
  
  print(table(sample_info$ko_strategy, sample_info$ko_gene))
  
  sample_info$ko_group = paste(sample_info$ko_gene, 
                               sample_info$ko_strategy, sep="_")
  print(table(sample_info$ko_group))
  print(length(unique(sample_info$ko_group)))
  print(table(sample_info$run_id, sample_info$ko_group))

  sample_info$log_q75 = log(sample_info$q75)
    
  # run DESeq2 for the two run ids separately for model system ExM
  if (model1 == "ExM"){ 
    sample_info$de_grp = paste0(sample_info$model_system, "_", sample_info$run_id)
  }else{
    sample_info$de_grp = sample_info$model_system
  }
  
  sorted_de_grps = sort(unique(sample_info$de_grp))
  
  for (d_group in sorted_de_grps){
    
    dg_index = which(sample_info$de_grp==d_group)
    cts_dat_m_dg = cts_dat_m[, dg_index]
    sample_info_dg = sample_info[dg_index, ]
    
    stopifnot(all(sample_info_dg$de_grp==d_group))
  
    print("-----------------------------------")
    print(paste0("DE analysis group: ", d_group))  
    if (sum(sample_info_dg$ko_gene=="WT")<2){
      print(sprintf("DE analysis group %s has only %d wild type samples", 
                    d_group, sum(sample_info_dg$ko_gene=="WT")))
      print(sprintf("do not run DESeq2 for it"))
      print("-----------------------------------")  
      break
    }else{
      print(sprintf("DE analysis group %s DESeq2 results:", d_group))
      print("-----------------------------------")  
    }    
    
    # do two steps of filtering
    # first, filter based on q75 of each gene
    # second, filter based on whether each gene is expressed in at least 4 cells
  
    dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
    cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
    
    print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                  length(dg_q75), nrow(cts_dat_m_dg)))
    
    n_pos = rowSums(cts_dat_m_dg>0)
    cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
    
    if (length(n_pos)==nrow(cts_dat_m_dg)){
      print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
    }else{
      print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                    length(n_pos), nrow(cts_dat_m_dg)))    
    }
  
    # update the q75 based on only genes used in the process
    sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
    sample_info_dg$log_q75 = log(sample_info_dg$q75)
    
    
    if (length(unique(sample_info_dg$run_id))==1){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + log_q75)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + run_id + log_q75)     
    }
    
    start.time <- Sys.time()
    dds = DESeq(dds)
    end.time <- Sys.time()
    
    time.taken <- end.time - start.time
    print(time.taken)
  
    ## association with read-depth
    res_rd = results(dds, name = "log_q75")
    res_rd = as.data.frame(res_rd)
    dim(res_rd)
    res_rd[1:2,]
  
    table(is.na(res_rd$padj))
  
    g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(d_group, " read depth"))
    print(g0)
  
    ## Rerun the analysis without read-depth if it is not significant for 
    ## most genes, and the number of samples is small
    ## i.e., proportion of non-null in the distribution is smaller than 0.01
    ## (this needs to be restricted to the genes with not NA adjusted pvalue)
    ## or if smaller than 0.1 and the number of samples is not greater than 6

    pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
    pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
    
    print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
    
    if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
      
      print("rerun DESeq2 without read depth")
      
      include_rd = 0
      if (length(unique(sample_info_dg$run_id))==1){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group)
      }else{
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + run_id)     
      }
      start.time <- Sys.time()
      dds = DESeq(dds)
      end.time <- Sys.time()
      
      time.taken <- end.time - start.time
      print(time.taken)
      
    }else{
      include_rd = 1
    }
    
    ## association with run_id
    
    if (length(unique(sample_info_dg$run_id))>1){
      
      if(include_rd){
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
      }else{
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
      }
      
      res_lrt = results(dds_lrt)
    
      res_run_id = as.data.frame(res_lrt)
      dim(res_run_id)
      res_run_id[1:2,]
    
      table(is.na(res_run_id$padj))
      
      g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(d_group, " Run ID"))
      print(g0)
      
    }
  
    ## DE analysis for each KO gene
    ## for knockout gene MEIS2 and knockout strategy KO
    ## there is only one knockout sample
    ## need to set the padj for all genes to NA for this setting of comparison 
    table(sample_info_dg$ko_gene)
    table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
    
    genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
    genos
  
    ko_grp  = c("CE", "KO", "PTC")
    
    for(geno in genos){
      
      res_geno_df = NULL
      res_geno_reduced_df = NULL
      res_geno = list()
        
      for(k_grp1 in ko_grp){
        
        res_g1 = results(dds, contrast = c("ko_group", 
                                           paste0(geno, "_", k_grp1), "WT_WT"))
        res_g1 = signif(data.frame(res_g1), 3)
        
        # add a column to record the number of zeros per test
        test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
        
        cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
        n_zero = rowSums(cts_dat_m_dg_test==0)
        res_g1$n_zeros = n_zero
        
        # record the number of samples involved in the comparison
        test_ko_group_vec = sample_info_dg$ko_group[test_index]
        
        df_size = rbind(df_size, 
                        c(d_group, 
                          paste0(geno, "_", k_grp1), 
                          sum(test_ko_group_vec!="WT_WT"), 
                          sum(test_ko_group_vec=="WT_WT")))
        
        # prepare a processed version of output
        res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
        res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
        
        if (sum(test_ko_group_vec!="WT_WT")==1){
          res_g1_reduced$padj = NA
        }
        
        res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
        
        res_geno[[k_grp1]] = res_g1_reduced
        
        if (sum(test_ko_group_vec!="WT_WT")>1){
          g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
            geom_histogram(color="darkblue", fill="lightblue", 
                           breaks = seq(0,1,by=0.01)) + 
            ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
          print(g1)
        }
  
        #tag1 = sprintf("%s_%s_%s_", d_group, geno, k_grp1)
        tag1 = sprintf("%s_%s_", geno, k_grp1)
        colnames(res_g1) = paste0(tag1, colnames(res_g1))
        colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
        
        if(is.null(res_geno_df)){
          res_geno_df = res_g1
        }else if(!is.null(res_geno_df)){
          stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
          res_geno_df = cbind(res_geno_df, res_g1)
        }

        if(is.null(res_geno_reduced_df)){
          res_geno_reduced_df = res_g1_reduced
        }else if(!is.null(res_geno_reduced_df)){
          stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
          res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
        }
        
      }
      
      get_index <- function(x){
        which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
      }
      
      # make an exception for the case of 
      # d_group = "ExM_20231004_23-robson-011"
      # geno = "MEIS2"
      if ((d_group=="ExM_20231004_23-robson-011")&(geno=="MEIS2")){

        w_ce = get_index(res_geno[["CE"]])
        w_ptc = get_index(res_geno[["PTC"]])

        print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
        print("There is no result for knock strategy KO due to number of knockout samples being only 1.")
        print(paste0("number of DE genes from knock out strategy CE: ", 
                     as.character(length(w_ce))))
        print(paste0("number of DE genes from knock out strategy PTC: ", 
                     as.character(length(w_ptc))))
        
        ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                          rownames(res_geno[["CE"]])[w_ce]))
        
        print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                     as.character(ptc_ce_overlap)))
        
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                               "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        
        print(g_up)
        
        df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                         padj_PTC = res_geno[["PTC"]]$padj)
    
        cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
        
        g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                              y = -log10(padj_CE))) +
            geom_pointdensity() + 
            ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
            scale_color_viridis()
        
        print(g4)
        
      }else{
        
        w_ce = get_index(res_geno[["CE"]])
        w_ko = get_index(res_geno[["KO"]])
        w_ptc = get_index(res_geno[["PTC"]])
    
        print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
        
        print(paste0("number of DE genes from knock out strategy CE: ", 
                     as.character(length(w_ce))))
        print(paste0("number of DE genes from knock out strategy KO: ", 
                     as.character(length(w_ko))))
        print(paste0("number of DE genes from knock out strategy PTC: ", 
                     as.character(length(w_ptc))))
        
        ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce], 
                                         rownames(res_geno[["KO"]])[w_ko]))
        
        ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko], 
                                          rownames(res_geno[["PTC"]])[w_ptc]))
        
        ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                          rownames(res_geno[["CE"]])[w_ce]))
        
        print(paste0("number of common DE genes by knock out strategies CE and KO: ", 
                     as.character(ce_ko_overlap)))
        print(paste0("number of common DE genes by knock out strategies KO and PTC: ", 
                     as.character(ko_ptc_overlap)))
        print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                     as.character(ptc_ce_overlap)))
            
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                  "KO" = rownames(res_geno[["KO"]])[w_ko], 
                  "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        print(g_up)
    
        df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                         padj_KO = res_geno[["KO"]]$padj, 
                         padj_PTC = res_geno[["PTC"]]$padj)
    
        cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
        cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
        
        g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                                y = -log10(padj_CE))) +
            geom_pointdensity() + ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
            scale_color_viridis()
        
        g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                              y = -log10(padj_KO))) +
          geom_pointdensity() + ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) + 
          scale_color_viridis()
        
        print(g4)
        print(g5)
      
      }
      
      dim(res_geno_df)
      res_geno_df[1:2,]
      
      dim(res_geno_reduced_df)
      res_geno_reduced_df[1:2,]
      
      res_df = data.frame(gene_ID=rownames(res_geno_df), 
                          res_geno_df)
      dim(res_df)
      res_df[1:2,]

      res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), gene_anno$ensembl_gene_id), ]
      stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
      
      # set gene hgnc_symbol that equal "" to NA
      hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
      hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
        
      res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df), 
                                  hgnc_symbol=hgnc_symbol_vec,
                                  chr=res_reduced_gene_anno$chr, 
                                  strand=res_reduced_gene_anno$strand, 
                                  start=res_reduced_gene_anno$start, 
                                  end=res_reduced_gene_anno$end, 
                                  res_geno_reduced_df)
      
      print("hgnc_symbol empty string and NA tables:")
      print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
      print(table(is.na(res_reduced_df$hgnc_symbol)))
      
      dim(res_reduced_df)
      res_reduced_df[1:2,]
      
      fwrite(res_df, 
             sprintf("%s/2024_05_JAX_RNAseq2_ExtraEmbryonic_%s_%s_DEseq2_raw.tsv", 
                     raw_output_dir, d_group, geno), 
             sep="\t")
      fwrite(res_reduced_df, 
             sprintf("%s/2024_05_JAX_RNAseq2_ExtraEmbryonic_%s_%s_DEseq2.tsv", 
                     processed_output_dir, d_group, geno), 
             sep="\t")
      
    }
    
  }
  
}
## [1] "ExM"
##      
##       MEF2C_CE MEF2C_KO MEF2C_PTC MEF2C_WT1 MEF2C_WT2 MEF2C_WT3 MEIS1_CE
##   CE         3        0         0         0         0         0        3
##   KO         0        3         0         0         0         0        0
##   PTC        0        0         3         0         0         0        0
##   WT         0        0         0         1         1         1        0
##      
##       MEIS1_KO MEIS1_PTC MEIS1_WT MEIS2_CE MEIS2_KO MEIS2_PTC MEIS2_WT
##   CE         0         0        0        3        0         0        0
##   KO         3         0        0        0        1         0        0
##   PTC        0         3        0        0        0         3        0
##   WT         0         0        3        0        0         0        3
## 
## TRUE 
##   34 
##      
##       MEF2C MEIS1 MEIS2 WT
##   CE      3     3     3  0
##   KO      3     3     1  0
##   PTC     3     3     3  0
##   WT      0     0     0  9
## 
##  MEF2C_CE  MEF2C_KO MEF2C_PTC  MEIS1_CE  MEIS1_KO MEIS1_PTC  MEIS2_CE  MEIS2_KO 
##         3         3         3         3         3         3         3         1 
## MEIS2_PTC     WT_WT 
##         3         9 
## [1] 10
##                         
##                          MEF2C_CE MEF2C_KO MEF2C_PTC MEIS1_CE MEIS1_KO
##   20231004_23-robson-011        0        0         0        3        3
##   20231124_23-robson-013        3        3         3        0        0
##                         
##                          MEIS1_PTC MEIS2_CE MEIS2_KO MEIS2_PTC WT_WT
##   20231004_23-robson-011         3        3        1         3     6
##   20231124_23-robson-013         0        0        0         0     3
## [1] "-----------------------------------"
## [1] "DE analysis group: ExM_20231004_23-robson-011"
## [1] "DE analysis group ExM_20231004_23-robson-011 DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 23897"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 40.93954 secs

## [1] "prop of non-null for rd: 2.32e-01"

## [1] "DE group ExM_20231004_23-robson-011 under KO gene MEIS2:"
## [1] "There is no result for knock strategy KO due to number of knockout samples being only 1."
## [1] "number of DE genes from knock out strategy CE: 44"
## [1] "number of DE genes from knock out strategy PTC: 176"
## [1] "number of common DE genes by knock out strategies PTC and CE: 40"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18953  4944 
## 
## FALSE  TRUE 
## 18953  4944

## [1] "DE group ExM_20231004_23-robson-011 under KO gene MEIS1:"
## [1] "number of DE genes from knock out strategy CE: 61"
## [1] "number of DE genes from knock out strategy KO: 76"
## [1] "number of DE genes from knock out strategy PTC: 50"
## [1] "number of common DE genes by knock out strategies CE and KO: 31"
## [1] "number of common DE genes by knock out strategies KO and PTC: 30"
## [1] "number of common DE genes by knock out strategies PTC and CE: 29"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18953  4944 
## 
## FALSE  TRUE 
## 18953  4944 
## [1] "-----------------------------------"
## [1] "DE analysis group: ExM_20231124_23-robson-013"
## [1] "DE analysis group ExM_20231124_23-robson-013 DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 24649"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24649 to 24060"
## Time difference of 12.74257 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 9.78231 secs

## [1] "DE group ExM_20231124_23-robson-013 under KO gene MEF2C:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18994  5066 
## 
## FALSE  TRUE 
## 18994  5066 
## [1] "PrS"
##      
##       BMLHE40_CE BMLHE40_KO BMLHE40_PTC BMLHE40_WT MDX1_CE MDX1_KO MDX1_KOLF2
##   CE           3          0           0          0       3       0          0
##   KO           0          3           0          0       0       3          0
##   PTC          0          0           3          0       0       0          0
##   WT           0          0           0          3       0       0          3
##      
##       MDX1_PTC NC0A3_CE NC0A3_KO NC0A3_PTC NC0A3_WT RUNX1_CE RUNX1_KO RUNX1_PTC
##   CE         0        3        0         0        0        3        0         0
##   KO         0        0        3         0        0        0        3         0
##   PTC        3        0        0         3        0        0        0         3
##   WT         0        0        0         0        3        0        0         0
##      
##       RUNX1_WT1 RUNX1_WT2 RUNX1_WT3
##   CE          0         0         0
##   KO          0         0         0
##   PTC         0         0         0
##   WT          1         1         1
## 
## TRUE 
##   48 
##      
##       BHLHE40 MXD1 NCOA3 RUNX1 WT
##   CE        3    3     3     3  0
##   KO        3    3     3     3  0
##   PTC       3    3     3     3  0
##   WT        0    0     0     0 12
## 
##  BHLHE40_CE  BHLHE40_KO BHLHE40_PTC     MXD1_CE     MXD1_KO    MXD1_PTC 
##           3           3           3           3           3           3 
##    NCOA3_CE    NCOA3_KO   NCOA3_PTC    RUNX1_CE    RUNX1_KO   RUNX1_PTC 
##           3           3           3           3           3           3 
##       WT_WT 
##          12 
## [1] 13
##                              
##                               BHLHE40_CE BHLHE40_KO BHLHE40_PTC MXD1_CE MXD1_KO
##   20230809_23-robson-007-run2          0          0           0       3       3
##   20231124_23-robson-013               0          0           0       0       0
##   20240307_24-robson-002               3          3           3       0       0
##   20240307_24-robson-005               0          0           0       0       0
##                              
##                               MXD1_PTC NCOA3_CE NCOA3_KO NCOA3_PTC RUNX1_CE
##   20230809_23-robson-007-run2        3        0        0         0        0
##   20231124_23-robson-013             0        0        0         0        3
##   20240307_24-robson-002             0        0        0         0        0
##   20240307_24-robson-005             0        3        3         3        0
##                              
##                               RUNX1_KO RUNX1_PTC WT_WT
##   20230809_23-robson-007-run2        0         0     3
##   20231124_23-robson-013             3         3     3
##   20240307_24-robson-002             0         0     3
##   20240307_24-robson-005             0         0     3
## [1] "-----------------------------------"
## [1] "DE analysis group: PrS"
## [1] "DE analysis group PrS DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25641 to 23756"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 47.80283 secs

## [1] "prop of non-null for rd: 5.05e-01"

## [1] "DE group PrS under KO gene NCOA3:"
## [1] "number of DE genes from knock out strategy CE: 29"
## [1] "number of DE genes from knock out strategy KO: 107"
## [1] "number of DE genes from knock out strategy PTC: 1"
## [1] "number of common DE genes by knock out strategies CE and KO: 17"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18951  4805 
## 
## FALSE  TRUE 
## 18951  4805

## [1] "DE group PrS under KO gene BHLHE40:"
## [1] "number of DE genes from knock out strategy CE: 429"
## [1] "number of DE genes from knock out strategy KO: 412"
## [1] "number of DE genes from knock out strategy PTC: 739"
## [1] "number of common DE genes by knock out strategies CE and KO: 298"
## [1] "number of common DE genes by knock out strategies KO and PTC: 337"
## [1] "number of common DE genes by knock out strategies PTC and CE: 320"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18951  4805 
## 
## FALSE  TRUE 
## 18951  4805

## [1] "DE group PrS under KO gene MXD1:"
## [1] "number of DE genes from knock out strategy CE: 120"
## [1] "number of DE genes from knock out strategy KO: 344"
## [1] "number of DE genes from knock out strategy PTC: 90"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 39"
## [1] "number of common DE genes by knock out strategies PTC and CE: 37"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18951  4805 
## 
## FALSE  TRUE 
## 18951  4805

## [1] "DE group PrS under KO gene RUNX1:"
## [1] "number of DE genes from knock out strategy CE: 1941"
## [1] "number of DE genes from knock out strategy KO: 351"
## [1] "number of DE genes from knock out strategy PTC: 2015"
## [1] "number of common DE genes by knock out strategies CE and KO: 173"
## [1] "number of common DE genes by knock out strategies KO and PTC: 188"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1098"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18951  4805 
## 
## FALSE  TRUE 
## 18951  4805
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
fwrite(df_size, 
       sprintf("%s/2024_05_JAX_RNAseq2_ExtraEmbryonic_DE_n_samples.tsv", 
                       output_dir), 
       sep="\t")
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  8135679 434.5   12621529 674.1         NA 12621529 674.1
## Vcells 35028473 267.3   79903722 609.7      65536 79903722 609.7
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.3                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.15.4          
##  [5] dplyr_1.1.4                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.5               viridisLite_0.4.2          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-60                
## [21] stringr_1.5.1               ggpubr_0.6.0               
## [23] ggrepel_0.9.5               ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-8           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.7             tools_4.2.3            backports_1.5.0       
##  [7] bslib_0.8.0            utf8_1.2.4             R6_2.5.1              
## [10] DBI_1.2.3              colorspace_2.1-1       GetoptLong_1.0.5      
## [13] withr_3.0.1            tidyselect_1.2.1       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.3         cli_3.6.3             
## [19] Cairo_1.6-2            DelayedArray_0.24.0    labeling_0.4.3        
## [22] sass_0.4.9             scales_1.3.0           digest_0.6.37         
## [25] rmarkdown_2.28         XVector_0.38.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.8.1      highr_0.11             fastmap_1.2.0         
## [31] GlobalOptions_0.1.2    rlang_1.1.4            rstudioapi_0.16.0     
## [34] RSQLite_2.3.7          farver_2.1.2           shape_1.4.6.1         
## [37] jquerylib_0.1.4        generics_0.1.3         jsonlite_1.8.8        
## [40] BiocParallel_1.32.6    car_3.1-2              RCurl_1.98-1.16       
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.9 Matrix_1.5-4.1        
## [46] Rcpp_1.0.13            munsell_0.5.1          fansi_1.0.6           
## [49] abind_1.4-5            lifecycle_1.0.4        stringi_1.8.4         
## [52] yaml_2.3.10            carData_3.0-5          zlibbioc_1.44.0       
## [55] plyr_1.8.9             blob_1.2.4             parallel_4.2.3        
## [58] crayon_1.5.3           lattice_0.22-6         Biostrings_2.66.0     
## [61] annotate_1.76.0        circlize_0.4.16        KEGGREST_1.38.0       
## [64] locfit_1.5-9.10        knitr_1.48             pillar_1.9.0          
## [67] rjson_0.2.21           ggsignif_0.6.4         geneplotter_1.76.0    
## [70] codetools_0.2-20       XML_3.99-0.17          glue_1.7.0            
## [73] evaluate_0.24.0        foreach_1.5.2          vctrs_0.6.5           
## [76] png_0.1-8              cellranger_1.1.0       gtable_0.3.5          
## [79] purrr_1.0.2            tidyr_1.3.1            clue_0.3-65           
## [82] cachem_1.1.0           xfun_0.47              xtable_1.8-4          
## [85] broom_1.0.6            rstatix_0.7.2          tibble_3.2.1          
## [88] iterators_1.0.14       AnnotationDbi_1.60.2   memoise_2.0.1         
## [91] cluster_2.1.6